MegaMUGA Annotations

Reading in reference genotypes and metadata

First I read in the reference sample genotypes, as well as marker annotations from an analysis previously conducted by Karl Broman, Dan Gatti, and Belinda Cornes.

# Reading in reference sample genotype data
control_genotypes <- suppressWarnings(data.table::fread("../data/MMControls/control.genotypes.csv"))
colnames(control_genotypes)[1] <- "marker"

## Reading in marker annotations fro Broman, Gatti, & Cornes analysis
mm_metadata <- data.table::fread("../data/MMControls/mm_uwisc_v2.csv")

Marker QC: Searching for missing genotype calls

We searched for probes where many mice are missing genotype calls.

## Calculating allele frequencies for each marker
control_allele_freqs <- control_genotypes %>%
  tidyr::pivot_longer(-marker, names_to = "sample", values_to = "genotype") %>%
  dplyr::group_by(marker, genotype) %>%
  dplyr::count() %>%
  # Result: number of genotype calls for each marker across all samples
  # i.e.
  #  marker            genotype     n
  #  B6_01-033811444-S A            8
  #  B6_01-033811444-S H            2
  #  B6_01-033811444-S N            1
  
  dplyr::ungroup() %>%
  dplyr::group_by(marker) %>%
  dplyr::mutate(freq = round(n/sum(n), 3),
                genotype = as.factor(genotype)) %>%
  # Result: allele frequency calls for each marker across all samples
  # i.e.
#  marker            genotype   n  freq
#  B6_01-033811444-S        A   8 0.022
#  B6_01-033811444-S        H   2 0.005
#  B6_01-033811444-S        N   1 0.003
#  B6_01-033811444-S        T 353 0.970
  dplyr::left_join(., mm_metadata)


## Filtering to markers with missing genotypes
no.calls <- control_allele_freqs %>%
  dplyr::ungroup() %>%
  dplyr::filter(genotype == "N") %>%
  tidyr::pivot_wider(names_from = genotype, 
                     values_from = n) %>%
  dplyr::select(marker, chr, bp_grcm39, freq) %>%
  dplyr::mutate(chr = as.factor(chr))


## Identifying markers with missing genotypes at a frequency higher than the 95th percentile of "N" frequencies across all markers
cutoff <- quantile(no.calls$freq, probs = seq(0,1,0.05))[[20]]
above.cutoff <- no.calls %>%
  dplyr::filter(freq > cutoff)

Of 77808 markers, 54990 failed to genotype at least one sample, and 2750 markers failed to genotype at least 12.765% of samples.

Sample QC

Searching for samples with poor marker representation

In a similar fashion, we calculated the number of reference samples with missing genotypes. Repeated observations of samples/strains with identical names meant that genotype counts for each marker among them couldn’t be grouped and tallied, so determining no-call frequency occurred column-wise. Mouse over individual samples to see the number of markers with missing genotypes for each sample.

## Calculating the number of missing markers for each sample
n.calls.strains <- apply(X = control_genotypes[,2:ncol(control_genotypes)], 
                         MARGIN = 2, 
                         function(x) table(x)[5])
n.calls.strains.df <- data.frame(n.calls.strains)
n.calls.strains.df$sample <- names(n.calls.strains)
n.calls.strains.df %<>%
  dplyr::rename(n.no.calls = n.calls.strains)
# n.no.calls                                   sample
# 2355                       (129S1/SvImJxA/J)F1f0056
# 2204                       (129S1/SvImJxA/J)F1f0056
# 2171                 (129S1/SvImJxC57BL/6J)F1f15916
# 2348                 (129S1/SvImJxC57BL/6J)F1m15914
# 2232                   (129S1/SvImJxCAST/EiJ)F1f005
# 2328                   (129S1/SvImJxCAST/EiJ)F1m002
# 2291                (129S1/SvImJxNOD/ShiLtJ)F1f0063
# 2197                 (129S1/SvImJxNZO/HILtJ)F1f0005
# 2316                 (129S1/SvImJxNZO/HILtJ)F1m0004


## Interactive plot of the number of missing genotypes for each sample.
bad_sample_cutoff <- quantile(n.calls.strains.df$n.no.calls, probs = seq(0,1,0.05))[20]
high.n.samples <- n.calls.strains.df %>%
  dplyr::filter(n.no.calls > bad_sample_cutoff)
sampleQC <- ggplot(n.calls.strains.df, 
                   mapping = aes(x = reorder(sample,n.no.calls), 
                                 y = n.no.calls,
                                 text = paste("Sample:", sample))) + 
  geom_point() +
  QCtheme + 
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  geom_hline(yintercept = bad_sample_cutoff, linetype = 2) + 
  labs(x = "Number of mice with missing genotypes",
       y = "Number of markers")
ggplotly(sampleQC, tooltip = c("text","y"))
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.

Validating sex of reference samples

We next validated the sexes of each sample using sex chromosome probe intensities. We paired up probe intensities, joined available metadata, and filtered down to only markers covering the X and Y chromosomes.

## Reading in genotype intensities
x_intensities <- suppressWarnings(data.table::fread("../data/MMControls/control.X.csv"))
colnames(x_intensities)[1] <- "marker"
y_intensities <- suppressWarnings(data.table::fread("../data/MMControls/control.Y.csv"))
colnames(y_intensities)[1] <- "marker"

## Check to see if dimensions of intensity tables are identical, marker orders identical, and sample orders identical
if((unique(dim(x_intensities) == dim(y_intensities)) && 
   unique(colnames(x_intensities) == colnames(y_intensities)) &&
   unique(x_intensities$marker == y_intensities$marker)) == TRUE){
     ## Pivoting the data longer
  x_int_long <- x_intensities %>%
  tidyr::pivot_longer(cols = -marker, 
                      names_to = "sample", 
                      values_to = "x_int")
  y_int_long <- y_intensities %>%
  tidyr::pivot_longer(cols = -marker, 
                      names_to = "sample", 
                      values_to = "y_int")
  
  long_intensities <- cbind(x_int_long, y_int_long)
} else {
     print("Source intensity data frames have non-identical structure; exiting")
}

## Joining slimmer intensity files with marker metadata and reducing to markers on sex chromosomes
long_XY_intensities <- long_intensities[,c(1,2,3,6)] %>%
  dplyr::left_join(., mm_metadata) %>%
  dplyr::filter(chr %in% c("X","Y"))

# Expected output
# Note: rows 1 and 2 demonstrate duplicate sample names with unique probe intensities (x_int, y_int).
# marker                       sample x_int y_int chr   bp_mm10 bp_grcm39   cM_cox strand snp
# XiD1       (129S1/SvImJxA/J)F1f0056 1.161 0.094   X 102827921 101871527 44.17434   plus  TG
# XiD1       (129S1/SvImJxA/J)F1f0056 1.034 0.054   X 102827921 101871527 44.17434   plus  TG
# XiD1 (129S1/SvImJxC57BL/6J)F1f15916 0.805 0.068   X 102827921 101871527 44.17434   plus  TG
# XiD1 (129S1/SvImJxC57BL/6J)F1m15914 0.371 0.035   X 102827921 101871527 44.17434   plus  TG
# XiD1   (129S1/SvImJxCAST/EiJ)F1f005 0.696 0.040   X 102827921 101871527 44.17434   plus  TG
# XiD1   (129S1/SvImJxCAST/EiJ)F1m002 0.591 0.041   X 102827921 101871527 44.17434   plus  TG
# unique unmapped                                            probe strand_flipped
# TRUE    FALSE CTGCCTTCAAAAGTGCTGGGATTAAAATGATGAGCGAGCAATGCCCAGCC          FALSE
# TRUE    FALSE CTGCCTTCAAAAGTGCTGGGATTAAAATGATGAGCGAGCAATGCCCAGCC          FALSE
# TRUE    FALSE CTGCCTTCAAAAGTGCTGGGATTAAAATGATGAGCGAGCAATGCCCAGCC          FALSE
# TRUE    FALSE CTGCCTTCAAAAGTGCTGGGATTAAAATGATGAGCGAGCAATGCCCAGCC          FALSE
# TRUE    FALSE CTGCCTTCAAAAGTGCTGGGATTAAAATGATGAGCGAGCAATGCCCAGCC          FALSE
# TRUE    FALSE CTGCCTTCAAAAGTGCTGGGATTAAAATGATGAGCGAGCAATGCCCAGCC          FALSE

Then we flagged markers with high missingness across all samples, as well as samples with high missingness among all markers.

## Flagging markers and samples based on previous QC steps
flagged_XY_intensities <- long_XY_intensities %>%
  dplyr::mutate(marker_flag = dplyr::if_else(condition = marker %in% above.cutoff$marker,
                                             true = "FLAG",
                                             false = "")) %>%
  dplyr::mutate(high_missing_sample = dplyr::if_else(condition = sample %in% high.n.samples$sample,
                                                     true = "FLAG",
                                                     false = ""))

The first round of inferring predicted sexes used a rough search of the sample name for expected nomenclature convention, which includes a sex denotation.

## First round of predicted sex inference
## Input: flagged XY intensities
prelim.predicted.sexes <- flagged_XY_intensities %>%
  dplyr::mutate(bg = dplyr::case_when(stringr::str_detect(string = sample, 
                                                          pattern = "F1") == TRUE ~ "F1",
                                      TRUE ~ as.character(sample)),
                predicted.sex = dplyr::case_when(stringr::str_detect(string = sample, 
                                                          pattern = "F1f") == TRUE ~ "f",
                                      stringr::str_detect(string = sample, 
                                                          pattern = "F1m") == TRUE ~ "m",
                                      TRUE ~ "unknown"))
## Output: flagged intensities with preliminary sex predictions and background assignments among F1 hybrids
# prelim.predicted.sexes[764:789,] %>% 
#       dplyr::select(marker, sample, marker_flag, high_missing_sample, predicted.sex)
#     marker                       sample marker_flag high_missing_sample predicted.sex
# 764   XiE2 (C57BL/6JxNOD/ShiLtJ)F1f0018        FLAG                                 f
# 765   XiE2  (C57BL/6JxNZO/HILtJ)F1f0016        FLAG                                 f
# 766   XiE2 (C57BL/6JxNZO/HILtJ)F1m15853        FLAG                                 m
# 767   XiE2     (C57BL/6JxPWK/PhJ)F1f002        FLAG                                 f
# 768   XiE2     (C57BL/6JxPWK/PhJ)F1m005        FLAG                                 m
# 769   XiE2     (C57BL/6JxPWK/PhJ)F1m005        FLAG                                 m
# 770   XiE2     (C57BL/6JxSJL/J)F1m35973        FLAG                FLAG             m
# 771   XiE2    (C57BL/6JxWSB/EiJ)F1f0300        FLAG                                 f
# 772   XiE2   (C57BL/6JxWSB/EiJ)F1m15714        FLAG                                 m
# 773   XiE2 (CAST/EiJx129S1/SvImJ)F1f012        FLAG                                 f
# 774   XiE2 (CAST/EiJx129S1/SvImJ)F1m001        FLAG                                 m
# 775   XiE2         (CAST/EiJxA/J)F1f002        FLAG                                 f
# 776   XiE2         (CAST/EiJxA/J)F1f002        FLAG                                 f
# 777   XiE2         (CAST/EiJxA/J)F1m005        FLAG                                 m
# 778   XiE2       (CAST/EiJxC57BL/6J)F1m        FLAG                                 m
# 779   XiE2       (CAST/EiJxC57BL/6J)F1m        FLAG                                 m
# 780   XiE2  (CAST/EiJxNOD/ShiLtJ)F1f007        FLAG                                 f
# 781   XiE2  (CAST/EiJxNOD/ShiLtJ)F1f007        FLAG                                 f
# 782   XiE2      (CAST/EiJxNZO/HILtJ)F1f        FLAG                                 f
# 783   XiE2      (CAST/EiJxNZO/HILtJ)F1f        FLAG                                 f
# 784   XiE2      (CAST/EiJxNZO/HILtJ)F1m        FLAG                                 m
# 785   XiE2      (CAST/EiJxNZO/HILtJ)F1m        FLAG                                 m
# 786   XiE2     (CAST/EiJxPWK/PhJ)F10123        FLAG                           unknown
# 787   XiE2    (CAST/EiJxPWK/PhJ)F1f0163        FLAG                                 f
# 788   XiE2    (CAST/EiJxWSB/EiJ)F1f0113        FLAG                                 f
# 789   XiE2    (CAST/EiJxWSB/EiJ)F1m0096        FLAG                                 m

## Filtering down to samples without preliminary sex predictions
unknown <- prelim.predicted.sexes %>%
  dplyr::filter(predicted.sex == "unknown")

## Using regex searching of sample IDs to deduce the sex of each sample

###########################################################
## Key processes and expected outputs at each iteration: ##
###########################################################

#####################################################################
## 1) Extracting the a substring of X digits into the sample name. ##
#####################################################################
# mouse.id.X = stringr::str_sub(sample, -X)
# i.e.)
# unknown %>% 
#       dplyr::mutate(mouse.id.3 = stringr::str_sub(sample, -3)) %>% 
#       dplyr::select(sample, mouse.id.3) %>% 
#       head(10)
#                      sample mouse.id.3
# 1  (CAST/EiJxPWK/PhJ)F10123        123
# 2                 017-FH-F1        -F1
# 3        124S4/SvJaeJm39510        510
# 4           129P1/ReJm35858        858
# 5             129P2/OlaHsdm        sdm
# 6             129P2/OlaHsdm        sdm
# 7             129P3/Jm37959        959
# 8              129S1/SvImJf        mJf
# 9              129S1/SvImJf        mJf
# 10         129S1/SvImJf0827        827

#####################################################################################
## 2) Assigning the predicted sex based on expected mouse nomenclature convention. ##
#####################################################################################
# predicted.sex.X = dplyr::case_when(stringr::str_sub(mouse.id.X, 1, 1) %in% c("m","M") ~ "m", stringr::str_sub(mouse.id.X, 1, 1) %in% c("f","F") ~ "f", TRUE ~ "unknown")
# i.e.)
# unknown %>% 
#         dplyr::mutate(mouse.id.5= stringr::str_sub(sample, -5),
#                       predicted.sex.5 = 
#                     dplyr::case_when(stringr::str_sub(mouse.id.5, 1, 1) %in% c("m","M") ~ "m",                                     stringr::str_sub(mouse.id.5, 1, 1) %in% c("f","F") ~ "f",                                     TRUE ~ "unknown")) %>% 
#         dplyr::select(sample, mouse.id.5, predicted.sex.5) %>% head(14)
#                      sample mouse.id.5 predicted.sex.5
# 1  (CAST/EiJxPWK/PhJ)F10123      10123         unknown
# 2                 017-FH-F1      FH-F1               f
# 3        124S4/SvJaeJm39510      39510         unknown
# 4           129P1/ReJm35858      35858         unknown
# 5             129P2/OlaHsdm      aHsdm         unknown
# 6             129P2/OlaHsdm      aHsdm         unknown
# 7             129P3/Jm37959      37959         unknown
# 8              129S1/SvImJf      vImJf         unknown
# 9              129S1/SvImJf      vImJf         unknown
# 10         129S1/SvImJf0827      f0827               f
# 11         129S1/SvImJf0827      f0827               f
# 12             129S1/SvImJm      vImJm         unknown
# 13             129S1/SvImJm      vImJm         unknown
# 14         129S1/SvImJm1314      m1314               m

##############################################################################################
# 3) Inferring the strain background by removing the mouse id from the sample name when a sex is predicted. In certain cases, symbols had to be extracted prior to sex and background inference.
##############################################################################################
# bg = if_else(condition = (predicted.sex.X == "m" | predicted.sex.X == "f"), 
#                           true = str_replace(string = bg, 
#                                              pattern = mouse.id.X, 
#                                              replacement = ""), 
#                           false = bg)
# i.e.) 
# unknown %>% 
#     dplyr::mutate(mouse.id.5 = stringr::str_sub(sample, -5), 
#                   mouse.id.5 = stringr::str_replace(string = mouse.id.5, 
#                                                     pattern = "[:symbol:]", 
#                                                     replacement = ""),
#                   predicted.sex.5 = dplyr::case_when(stringr::str_sub(mouse.id.5, 1, 1) %in% c("m","M") ~ "m",
#                                                      stringr::str_sub(mouse.id.5, 1, 1) %in% c("f","F") ~ "f",
#                                                      TRUE ~ "unknown"),
#                   bg = if_else(condition = (predicted.sex.5 == "m" | predicted.sex.5 == "f"),
#                               true = str_replace(string = bg, 
#                                                  pattern = mouse.id.5, 
#                                                  replacement = ""), 
#                               false = bg)) %>% 
#     dplyr::select(sample, mouse.id.5, predicted.sex.5, bg) %>% head(14)
#                      sample mouse.id.5 predicted.sex.5                 bg
# 1  (CAST/EiJxPWK/PhJ)F10123      10123         unknown                 F1
# 2                 017-FH-F1      FH-F1               f                 F1
# 3        124S4/SvJaeJm39510      39510         unknown 124S4/SvJaeJm39510
# 4           129P1/ReJm35858      35858         unknown    129P1/ReJm35858
# 5             129P2/OlaHsdm      aHsdm         unknown      129P2/OlaHsdm
# 6             129P2/OlaHsdm      aHsdm         unknown      129P2/OlaHsdm
# 7             129P3/Jm37959      37959         unknown      129P3/Jm37959
# 8              129S1/SvImJf      vImJf         unknown       129S1/SvImJf
# 9              129S1/SvImJf      vImJf         unknown       129S1/SvImJf
# 10         129S1/SvImJf0827      f0827               f        129S1/SvImJ
# 11         129S1/SvImJf0827      f0827               f        129S1/SvImJ
# 12             129S1/SvImJm      vImJm         unknown       129S1/SvImJm
# 13             129S1/SvImJm      vImJm         unknown       129S1/SvImJm
# 14         129S1/SvImJm1314      m1314               m        129S1/SvImJ

digit.trim <- unknown %>% 
  dplyr::mutate(mouse.id.1 = stringr::str_sub(sample, -1),
                predicted.sex.1 = dplyr::case_when(stringr::str_sub(mouse.id.1, 1, 1) %in% c("m","M") ~ "m",
                                                   stringr::str_sub(mouse.id.1, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.1 == "m" | predicted.sex.1 == "f"), 
                             true = str_replace(string = bg, 
                                                pattern = mouse.id.1, 
                                                replacement = ""), 
                             false = bg),
                
                mouse.id.3= stringr::str_sub(sample, -3),
                predicted.sex.3 = dplyr::case_when(stringr::str_sub(mouse.id.3, 1, 1) %in% c("m","M") ~ "m",
                                                 stringr::str_sub(mouse.id.3, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.3 == "m" | predicted.sex.3 == "f"), 
                             true = str_replace(string = bg, 
                                                pattern = mouse.id.3, 
                                                replacement = ""), 
                             false = bg),
                
                mouse.id.4 = stringr::str_sub(sample, -4),
                predicted.sex.4 = dplyr::case_when(stringr::str_sub(mouse.id.4, 1, 1) %in% c("m","M") ~ "m",
                                                 stringr::str_sub(mouse.id.4, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.4 == "m" | predicted.sex.4 == "f"), 
                             true = str_replace(string = bg, 
                                                pattern = mouse.id.4, 
                                                replacement = ""), 
                             false = bg),
                
                mouse.id.5 = stringr::str_sub(sample, -5),
                mouse.id.5 = stringr::str_replace(string = mouse.id.5,  ## a couple symbols in these ids mess up the regex search
                                                  pattern = "[:symbol:]", 
                                                  replacement = ""),
                predicted.sex.5 = dplyr::case_when(stringr::str_sub(mouse.id.5, 1, 1) %in% c("m","M") ~ "m",
                                                 stringr::str_sub(mouse.id.5, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.5 == "m" | predicted.sex.5 == "f"),
                             true = str_replace(string = bg,
                                                pattern = mouse.id.5,
                                                replacement = ""),
                             false = bg),
                
                mouse.id.6 = stringr::str_sub(sample, -6),
                mouse.id.6 = stringr::str_replace(string = mouse.id.6,  ## a couple symbols in these ids mess up the regex search
                                                  pattern = "[:punct:]", 
                                                  replacement = ""),
                mouse.id.6 = stringr::str_replace(string = mouse.id.6,  ## a couple symbols in these ids mess up the regex search
                                                  pattern = "[:symbol:]", 
                                                  replacement = ""),
                predicted.sex.6 = dplyr::case_when(stringr::str_sub(mouse.id.6, 1, 1) %in% c("m","M") ~ "m",
                                                 stringr::str_sub(mouse.id.6, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.6 == "m" | predicted.sex.6 == "f"), 
                             true = str_replace(string = bg, 
                                                pattern = mouse.id.6, 
                                                replacement = ""), 
                             false = bg)) %>%
  dplyr::mutate(predicted.sex = dplyr::case_when(predicted.sex.1 == "m" ~ "m", 
                                                 predicted.sex.3 == "m" ~ "m",
                                                 predicted.sex.4 == "m" ~ "m",
                                                 predicted.sex.5 == "m" ~ "m",
                                                 predicted.sex.6 == "m" ~ "m",
                                                 
                                                 predicted.sex.1 == "f" ~ "f", 
                                                 predicted.sex.3 == "f" ~ "f",
                                                 predicted.sex.4 == "f" ~ "f",
                                                 predicted.sex.5 == "f" ~ "f",
                                                 predicted.sex.6 == "f" ~ "f",
                                                 TRUE ~ "unknown"))

# Removing previously "unknown" samples from initial results and binding newly inferred samples
predicted.sexes.strings <- prelim.predicted.sexes %>%
  dplyr::filter(predicted.sex != "unknown") %>%
  dplyr::bind_rows(.,digit.trim)

## Taking the first marker as a sample and tabulating the number of samples for each predicted sex
predicted.sex.table <- predicted.sexes.strings %>%
  dplyr::filter(marker %in% unique(prelim.predicted.sexes$marker)[1]) %>%
  dplyr::select(sample, predicted.sex, bg) %>%
  dplyr::group_by(predicted.sex) %>%
  dplyr::count() 

This captured 94 female samples, 249 male samples, leaving 21 samples of unknown predicted sex from nomenclature alone. These samples (below) exhibit a range of naming irregularities. Note there are triplicate KOMP cell DNA JM8 samples.

# Table of samples for which sex could not be predicted from sample name alone. Using one marker is fine as an example as the sample info for each marker is identical.
predicted.sexes.strings %>%
  dplyr::filter(predicted.sex == "unknown",
                marker == predicted.sexes.strings$marker[[1]]) %>%
  dplyr::select(sample, bg)
##               sample                bg
## 1           34-HG-F1                F1
## 2           36-PG-F1                F1
## 3             BAG74u            BAG74u
## 4              BAG99             BAG99
## 5           CAST/EiJ          CAST/EiJ
## 6               IN13              IN13
## 7               IN25              IN25
## 8               IN34              IN34
## 9               IN40              IN40
## 10              IN47              IN47
## 11              IN54              IN54
## 12 KOMP cell DNA JM8 KOMP cell DNA JM8
## 13 KOMP cell DNA JM8 KOMP cell DNA JM8
## 14 KOMP cell DNA JM8 KOMP cell DNA JM8
## 15           MWN1026           MWN1026
## 16           MWN1030           MWN1030
## 17           MWN1194           MWN1194
## 18           MWN1198           MWN1198
## 19          MWN1214u          MWN1214u
## 20           MWN1279           MWN1279
## 21           MWN1287           MWN1287

After predicting the sexes of the vast majority of reference samples, we visualized the average probe intensity among X Chromosome markers for each sample, labeling them by predicted sex. Samples colored black were unabled to have their sex inferred by the sample name, but cluster well with mice for which sex could be inferred. Conversely, some samples’ predicted sex is discordant with X and Y Chromosome marker intensities (i.e. blue samples that cluster with mostly orange samples, and vice versa). Mouse over individual dots to view the sample, as well as whether it was flagged for having many markers with missing genotype information.

# Input: Sex chromosome probe intensities for each marker with 1) marker metdata, 2) marker and sample flags, 3) background and sex predictions

Xchr.int <- predicted.sexes.strings %>%
  dplyr::ungroup() %>%
  dplyr::filter(marker_flag != "FLAG",
                chr == "X") %>%
  dplyr::mutate(x.chr.int = x_int + y_int) %>%
  dplyr::group_by(sample, predicted.sex, high_missing_sample) %>%
  dplyr::summarise(mean.x.chr.int = mean(x.chr.int))
# Expected output: Sample-averaged summed x- and y-channel probe intensities for all chromosome X markers. Note: replicated sample information collapses at this step. This is tolerable under the assumption that the samples with identical names are in fact duplicates of the same individual.

# sample                          predicted.sex high_missing_sample mean.x.chr.int
# <chr>                           <chr>         <chr>                        <dbl>
# (129S1/SvImJxA/J)F1f0056        f             ""                           1.02 
# (129S1/SvImJxC57BL/6J)F1f15916  f             ""                           1.04 
# (129S1/SvImJxC57BL/6J)F1m15914  m             ""                           0.781
# (129S1/SvImJxCAST/EiJ)F1f005    f             ""                           1.19 
# (129S1/SvImJxCAST/EiJ)F1m002    m             ""                           0.817
# (129S1/SvImJxNOD/ShiLtJ)F1f0063 f             ""                           1.05 

Ychr.int <- predicted.sexes.strings %>%
  dplyr::ungroup() %>%
  dplyr::filter(marker_flag != "FLAG",
                chr == "Y") %>%
  dplyr::group_by(sample, predicted.sex, high_missing_sample) %>%
  dplyr::summarise(mean.y.int = mean(y_int))
# Expected output: Sample-averaged y-channel probe intensities for all chromosome Y markers. Note: replicated sample information collapses at this step. This is tolerable under the assumption that the samples with identical names are in fact duplicates of the same individual.

# sample                          predicted.sex high_missing_sample mean.y.int
# <chr>                           <chr>         <chr>                    <dbl>
# (129S1/SvImJxA/J)F1f0056        f             ""                      0.0543
# (129S1/SvImJxC57BL/6J)F1f15916  f             ""                      0.0644
# (129S1/SvImJxC57BL/6J)F1m15914  m             ""                      0.400 
# (129S1/SvImJxCAST/EiJ)F1f005    f             ""                      0.0454
# (129S1/SvImJxCAST/EiJ)F1m002    m             ""                      0.167 
# (129S1/SvImJxNOD/ShiLtJ)F1f0063 f             ""                      0.071 


# Column binding the two intensities if the sample information matches
if(unique(Xchr.int$sample == Ychr.int$sample) == TRUE){
  sex.chr.intensities <- cbind(Xchr.int,Ychr.int$mean.y.int)
  colnames(sex.chr.intensities) <- c("sample","predicted.sex","bad_sample","sumxy_int","y_int")
}

# Interactive visualization of sex prediction results. Sample are colored according to predicted sex. "Unknown" samples are plotted black, and flagged/bad samples are triangles.
predicted.sex.plot.palettes <- sex.chr.intensities %>%
  dplyr::ungroup() %>%
  dplyr::distinct(sample, predicted.sex, bad_sample) %>%
  dplyr::mutate(predicted.sex.palette = dplyr::case_when(predicted.sex == "f" ~ "#5856b7",
                                                         predicted.sex == "m" ~ "#eeb868",
                                                         predicted.sex == "unknown" ~ "black"))
predicted.sex.palette <- predicted.sex.plot.palettes$predicted.sex.palette
names(predicted.sex.palette) <- predicted.sex.plot.palettes$predicted.sex
mean.x.intensities.by.sex.plot <- ggplot(sex.chr.intensities, 
                                         mapping = aes(x = sumxy_int, 
                                                       y = y_int, 
                                                       colour = predicted.sex,
                                                       shape = bad_sample,
                                                       text = sample,
                                                       label = bad_sample)) + 
  geom_point() + 
  scale_colour_manual(values = predicted.sex.palette) + 
  # facet_grid(.~chr) +
  QCtheme
ggplotly(mean.x.intensities.by.sex.plot, 
         tooltip = c("text","label"))

By visual inspection, we found the following issues with either the sample name-based sex prediction or putative sex mix-ups:

  • 129S1/SvImJf: Predicted as male, plotted with females; this makes sense because following a 1 character search for sex classifier (yielding “f”) is a three character search (yielding “mJf”). Might need a specific fix for this sample. Almost certainly a female that was predicted incorrectly.

  • R26FIpe/FIpem997: Predicted male, plotted with females. Four character search would yield proper mouse ID, but clustered stereotypically with other female samples.

  • Nestin -/- Cre+m; TgIn -/- Cre+m: Predicted male, plotted with females.

  • (CAST/EiJxPWK/PhJ)F10123: Predicted female, plotted as male. Capitalized “M” or “F” were treated with equal weight as lowercase sex classifiers in mouse ID. Specific Chr X genotypes should reveal sex (i.e. do Chr X calls match CAST/EiJ or PWK/PhJ sample Chr X calls?)

  • 017-FH-F1: Predicted female, plotted as male. Capitalized “F” following five character search would have yielded this prediction. Even without proper nomenclature, the sample’s stereotypical clustering would suggest this sample is male.

  • (BALB/cByJxC57BL/6J)F1f31283; (BALB/cJxC57BL/6J)F1f36006: Predicted female, plotted as male. Samples are from very similar strain backgrounds. Specific Chr X genotypes should reveal sex (i.e. do Chr X calls match BALB/cJ samples?).

In addition, all samples for which sex could not be predicted based on sample ID clustered with one sex or the other based on probe intensity.

Because the split between inferred sexes of samples was so distinct, we used k-means clustering to quickly match the clusters to sexed samples and assign or re-assign sexes to samples with unknown or apparently incorrect sex information, respectively. Samples highlighted above were also re-evaluated using strain-specific marker information.

# Clear visual clustering of samples motivated us to use a rough clustering method to quickly assign groups to samples based on X and Y chromsome probe intensities. K-means clustering is below supplying two clusters for each sex.
# Inputs: 
# 1) Sample-averaged summed x- and y-channel probe intensities for all chromosome X markers
# 2) Sample-averaged y-channel probe intensities for all chromosome Y markers
rough_clusters <- kmeans(sex.chr.intensities[,4:5], centers = 2)$cluster

# Joining each sample's cluster assignment to the sample-averaged intensity metrics
sex.chr.k.means <- sex.chr.intensities %>%
  dplyr::ungroup() %>%
  dplyr::mutate(clust = as.factor(rough_clusters))

# Generating a contingency table for how each cluster paired with each sex. 
sex.by.cluster.tab <- sex.chr.k.means %>%
  dplyr::group_by(predicted.sex, clust) %>%
  dplyr::count() %>%
  dplyr::arrange(desc(n))

# The most common clusters should be the two sexes, k-means doesn't always assign the same cluster name to the same sex. Therefore, the top clusters must be pulled out and assigned sexes dynamically.
top.clusters <- sex.by.cluster.tab[1:2,] %>%
  dplyr::ungroup() %>%
  dplyr::mutate(inferred.sex = predicted.sex) %>%
  dplyr::select(-n,-predicted.sex)

# Samples are then recoded according to the k-means assigned sexes
reSexed_samples <- sex.chr.k.means %>%
  dplyr::select(-predicted.sex) %>%
  dplyr::left_join(.,top.clusters) %>%
  dplyr::left_join(sex.chr.intensities %>%
                     dplyr::select(sample, predicted.sex))

# Prints a table of all samples with an option to view whether a sample had its sex redesignated.
reSexed_samples_table <- reSexed_samples %>%
  dplyr::mutate(resexed = predicted.sex != inferred.sex)
reSexed_samples_table %>% 
  dplyr::select(sample, resexed, predicted.sex, inferred.sex) %>%
  DT::datatable(., filter = "top", 
              escape = FALSE)

The plot below demonstrates that this clustering technique does a pretty good job at capturing the information we want. Moving forward with sample QC we used the reassigned inferred sexes of the samples.

# Interactive scatter plot of intensities similar to above, but recolors and outlines samples based on redesignated sexes.
reSexed.plot <- ggplot(reSexed_samples_table %>%
         dplyr::arrange(predicted.sex),
       mapping = aes(x = sumxy_int, 
                     y = y_int, 
                     fill = inferred.sex,
                     colour = predicted.sex,
                     text = sample,
                     label = resexed,
                     label2  = bad_sample)) + 
  geom_point(shape = 21,size = 3, alpha = 0.7) + 
  scale_colour_manual(values =  predicted.sex.palette) +
  scale_fill_manual(values = c(unique(predicted.sex.palette)[1:2])) +
  QCtheme

ggplotly(reSexed.plot, 
         tooltip = c("text","label","label2"))

Validating reference sample genetic backgrounds

A key component of sample QC for our purposes is knowing that markers that we expect to deliver the consensus genotype (i.e. in a cross) actually provide us the correct strain information and allow us to correctly infer haplotypes.

# Vector of founder strain names
founders <- c("A/J","C57BL/6J","129S1/SvImJ","NOD/ShiLtJ",
              "NZO/HILtJ","CAST/EiJ","PWK/PhJ","WSB/EiJ")

# Re-flag genotypes based on bad markers or bad samples.
# Inputs:
# 1) All sample genotypes
# 2) marker metadata
# 3) flag cutoff tables
genos.flagged <- control_genotypes %>%
  tidyr::pivot_longer(-marker, 
                      names_to = "sample", 
                      values_to = "genotype") %>%
  dplyr::left_join(., mm_metadata) %>%
  # Flagging markers and samples
  dplyr::mutate(marker_flag = dplyr::if_else(condition = marker %in% above.cutoff$marker,
                                             true = "FLAG",
                                             false = ""),
                high_missing_sample = dplyr::if_else(condition = sample %in% high.n.samples$sample,
                                                     true = "FLAG",
                                                     false = ""))

# Join the sample table with resex information with each sample's strain background from initial sex prediction.
# Inputs:
# 1) Sample metadata, including sex
# 2) Sample strain background
sample.meta <- reSexed_samples_table %>%
  dplyr::select(sample, bad_sample, inferred.sex, resexed) %>%
  dplyr::left_join(predicted.sexes.strings %>%
                     dplyr::distinct(sample, bg) %>%
                     # Spot fix bg of one resexed 129S1 sample
                     dplyr::mutate(bg = if_else(bg == "129S1/SvIJm", 
                                                true = "129S1/SvImJ", 
                                                false = bg)))


# From the sample metadata, extract any sample derived from an inbred founder.
founder_samples <- sample.meta %>%
  dplyr::filter(bg %in% c(founders)) %>%
  dplyr::mutate(dam = bg,
                sire = bg)

# Extract any sample derived from a cross between different founder strains.
founder_F1_samples <- sample.meta %>%
  dplyr::filter(bg == "F1")

# Supply each sample strain background, pull out parental strains, and join with samples
parent.table <- purrr::map(founder_F1_samples$sample, extractParents) %>%
  Reduce(rbind, .)
founder_F1_samples_parents <- cbind(founder_F1_samples, parent.table)

# Bind together founder sample metadata and F1 sample metadata, removing a few instances where other crosses were tagged by string-based sex prediction
# Other instances:
# 017-FH-F1
# 34-HG-F1
# 36-PG-F1
# JF1/Msm35243
all_founder_samples_parents <- founder_samples %>%
  dplyr::bind_rows(., founder_F1_samples_parents) %>%
  dplyr::filter(!is.na(sire),
                sire %in% founders,
                dam %in% founders)

# Count up samples for each founder and resulting cross and display a table
# Dam names = row names; Sire name = column names
founder_sample_table <- all_founder_samples_parents %>%
  dplyr::group_by(dam,sire) %>%
  dplyr::count() %>%
  tidyr::pivot_wider(names_from = sire, values_from = n)
DT::datatable(founder_sample_table, escape = FALSE, 
              options = list(columnDefs = list(list(width = '20%', targets = c(8)))))

From the table, we can see that all possible pairwise combinations of CC/DO founder strains are represented, with the exception of (NZO/HILtJxCAST/EiJ)F1 and (NZO/HILtJxPWK/PhJ)F1. These missing samples could be interesting; these two crosses have been previously noted as “reproductively incompatible” in the literature.

# Join all genotypes to founder-derived samples, filter away bad markers, and reduce down to unique rows for each sample and marker genotype
# Inputs: 
# 1) Founder sample metadata (colnames(all_founder_samples_parents) = "sample"       "bad_sample"   "inferred.sex" "resexed"      "bg"           "dam"         "sire")
# 2) All sample genotypes with flag information
founder_sample_genos <- all_founder_samples_parents %>%
  dplyr::left_join(.,genos.flagged) %>%
  dplyr::filter(marker_flag != "FLAG",
                genotype != "N") %>%
  dplyr::distinct(sample, marker, genotype, inferred.sex, resexed, bad_sample, dam, sire)

# Some samples have multiple calls for the same marker genotype. At this step we identify these markers, and later we remove them to form consensus genotypes for each founder. 
multicalled_markers <- founder_sample_genos %>%
  dplyr::group_by(marker, sample) %>%
  dplyr::count() %>%
  dplyr::filter(n != 1)

First, we joined samples derived from CC/DO founders to their respective genotypes and removed markers with multiple genotype calls across the same sample. These observations make determining founder consensus genotypes complicated; we expect and desire a 1:1:1 relationship between each sample:marker:genotype. We counted 376 such markers. After removing these markers as well as those previously flagged, we constructed a rough dendrogram from good marker genotypes to determine whether samples cluster according to known relationships among founder strains. Edge colors represent rough clustering into six groups - three of which contain samples derived from wild-derived founder strains and their F1 hybrids with other founder strains.

# Creating a wide genotype table to compute the distance matrix for the dendrogram, filtering out the markers with multiple genotype calls per sample.
wide_founder_sample_genos <- founder_sample_genos %>%
  dplyr::filter(!marker %in% multicalled_markers$marker) %>%
  dplyr::select(sample, marker, genotype) %>%
  tidyr::pivot_wider(names_from = marker, values_from = genotype)

# Genotype calls at this point are still in letter form (i.e. A, T, or H for het). In order to calculate the distance matrix, we had to recode each marker's genotype information into 0, 1 for hets or 2. This process is applied column-wise.
recoded_wide_sample_genos <- suppressWarnings(data.frame(apply(wide_founder_sample_genos[,2:ncol(wide_founder_sample_genos)], 2, recodeCalls)))
rownames(recoded_wide_sample_genos) <- wide_founder_sample_genos$sample

# Scaling the genotype matrix, then calculating euclidean distance between all samples
dd <- dist(scale(recoded_wide_sample_genos), method = "euclidean")
hc <- hclust(dd, method = "ward.D2")

# Plotting sample distances as a dendrogram
dend_colors = c("slateblue", # classical strains
                "blue", 
                qtl2::CCcolors[6:8]) # official colors for wild-derived CC/DO founder strains
clus8 = cutree(hc, 5)
plot(as.phylo(hc), type = "f", cex = 0.5, tip.color = dend_colors[clus8],
     no.margin = T, label.offset = 1, edge.width = 0.5)

From here we curated a set of genotypes for each CC/DO founder strain that were fixed across replicate samples from that strain. The genotypes for intersecting markers between two strains were combined “crossed” to form predicted genotypes for each F1 hybrid of CC/DO founders. Then, the genotypes of each CC/DO founder F1 hybrid were compared directly to what was predicted, and the concordance shown below is the proportion of markers of each individual that match this prediction.

# Loop creates a data frame of consensus genotype calls for each CC/DO founder strain
# In this case, "consensus" is designated by all samples of the same strain having identical genotype calls for the same marker
for(f in founders){
  
  print(paste("Generating Calls for",f))
  
  # Pulling the samples for each CC/DO founder strain
  founder.geno.array <- all_founder_samples_parents %>%
    dplyr::filter(bg == f) %>%
    # Attach all genotypes
    dplyr::left_join(.,genos.flagged, by = "sample") %>%
    # Use only high-quality markers
    dplyr::filter(marker_flag != "FLAG")

  
  # Count the number of unique allele calls for each marker
  founder.allele.counts <- founder.geno.array %>%
    dplyr::group_by(marker, genotype) %>%
    dplyr::count()
  
  # Determine which markers have identical calls across all samples from the same genetic background
  founder_genos <- founder.allele.counts %>%
    dplyr::filter(n == max(founder.allele.counts$n)) %>%
    dplyr::select(-n) %>%
    `colnames<-`(c("marker",f))
  
  # Assign these calls to a founder object
  assign(paste0("Calls_",f), founder_genos)

}
## [1] "Generating Calls for A/J"
## [1] "Generating Calls for C57BL/6J"
## [1] "Generating Calls for 129S1/SvImJ"
## [1] "Generating Calls for NOD/ShiLtJ"
## [1] "Generating Calls for NZO/HILtJ"
## [1] "Generating Calls for CAST/EiJ"
## [1] "Generating Calls for PWK/PhJ"
## [1] "Generating Calls for WSB/EiJ"
# Assemble a list of predicted genotypes for F1s by combining each founder-specific dataframe of calls
# First form a list of dams that comprise each F1 cross type
dams <- data.frame(tidyr::expand_grid(founders, founders)) %>%
  `colnames<-`(c("dams","sires")) %>%
  # Select crosses between strains
  filter(dams != sires) %>%
  dplyr::select(dams) %>%
  as.list()
# Loop through the list of cross types and pull in genotype call objects
dam_calls <- list()
for(i in 1:length(dams$dams)){
  dam_calls[[i]] <- get(ls(pattern = paste0("Calls_",dams$dams[i])))
}

# Do the same for the sires for each F1
sires <- data.frame(tidyr::expand_grid(founders, founders)) %>%
  `colnames<-`(c("dams","sires")) %>%
  filter(dams != sires) %>%
  dplyr::select(sires) %>%
  as.list()
sire_calls <- list()
for(i in 1:length(sires$sires)){
  sire_calls[[i]] <- get(ls(pattern = paste0("Calls_",sires$sires[i])))
}

# Compare the predicted F1 genotypes to the genotypes of each F1 sample
bg_QC <- purrr::map2(dam_calls, sire_calls, founder_background_QC)
## [1] "Running QC: (A/JxC57BL/6J)F1"
## [1] "Running QC: (A/Jx129S1/SvImJ)F1"
## [1] "Running QC: (A/JxNOD/ShiLtJ)F1"
## [1] "Running QC: (A/JxNZO/HILtJ)F1"
## [1] "Running QC: (A/JxCAST/EiJ)F1"
## [1] "Running QC: (A/JxPWK/PhJ)F1"
## [1] "Running QC: (A/JxWSB/EiJ)F1"
## [1] "Running QC: (C57BL/6JxA/J)F1"
## [1] "Running QC: (C57BL/6Jx129S1/SvImJ)F1"
## [1] "Running QC: (C57BL/6JxNOD/ShiLtJ)F1"
## [1] "Running QC: (C57BL/6JxNZO/HILtJ)F1"
## [1] "Running QC: (C57BL/6JxCAST/EiJ)F1"
## [1] "Running QC: (C57BL/6JxPWK/PhJ)F1"
## [1] "Running QC: (C57BL/6JxWSB/EiJ)F1"
## [1] "Running QC: (129S1/SvImJxA/J)F1"
## [1] "Running QC: (129S1/SvImJxC57BL/6J)F1"
## [1] "Running QC: (129S1/SvImJxNOD/ShiLtJ)F1"
## [1] "Running QC: (129S1/SvImJxNZO/HILtJ)F1"
## [1] "Running QC: (129S1/SvImJxCAST/EiJ)F1"
## [1] "Running QC: (129S1/SvImJxPWK/PhJ)F1"
## [1] "Running QC: (129S1/SvImJxWSB/EiJ)F1"
## [1] "Running QC: (NOD/ShiLtJxA/J)F1"
## [1] "Running QC: (NOD/ShiLtJxC57BL/6J)F1"
## [1] "Running QC: (NOD/ShiLtJx129S1/SvImJ)F1"
## [1] "Running QC: (NOD/ShiLtJxNZO/HILtJ)F1"
## [1] "Running QC: (NOD/ShiLtJxCAST/EiJ)F1"
## [1] "Running QC: (NOD/ShiLtJxPWK/PhJ)F1"
## [1] "Running QC: (NOD/ShiLtJxWSB/EiJ)F1"
## [1] "Running QC: (NZO/HILtJxA/J)F1"
## [1] "Running QC: (NZO/HILtJxC57BL/6J)F1"
## [1] "Running QC: (NZO/HILtJx129S1/SvImJ)F1"
## [1] "Running QC: (NZO/HILtJxNOD/ShiLtJ)F1"
## [1] "Running QC: (NZO/HILtJxCAST/EiJ)F1"
## [1] "Running QC: (NZO/HILtJxPWK/PhJ)F1"
## [1] "Running QC: (NZO/HILtJxWSB/EiJ)F1"
## [1] "Running QC: (CAST/EiJxA/J)F1"
## [1] "Running QC: (CAST/EiJxC57BL/6J)F1"
## [1] "Running QC: (CAST/EiJx129S1/SvImJ)F1"
## [1] "Running QC: (CAST/EiJxNOD/ShiLtJ)F1"
## [1] "Running QC: (CAST/EiJxNZO/HILtJ)F1"
## [1] "Running QC: (CAST/EiJxPWK/PhJ)F1"
## [1] "Running QC: (CAST/EiJxWSB/EiJ)F1"
## [1] "Running QC: (PWK/PhJxA/J)F1"
## [1] "Running QC: (PWK/PhJxC57BL/6J)F1"
## [1] "Running QC: (PWK/PhJx129S1/SvImJ)F1"
## [1] "Running QC: (PWK/PhJxNOD/ShiLtJ)F1"
## [1] "Running QC: (PWK/PhJxNZO/HILtJ)F1"
## [1] "Running QC: (PWK/PhJxCAST/EiJ)F1"
## [1] "Running QC: (PWK/PhJxWSB/EiJ)F1"
## [1] "Running QC: (WSB/EiJxA/J)F1"
## [1] "Running QC: (WSB/EiJxC57BL/6J)F1"
## [1] "Running QC: (WSB/EiJx129S1/SvImJ)F1"
## [1] "Running QC: (WSB/EiJxNOD/ShiLtJ)F1"
## [1] "Running QC: (WSB/EiJxNZO/HILtJ)F1"
## [1] "Running QC: (WSB/EiJxCAST/EiJ)F1"
## [1] "Running QC: (WSB/EiJxPWK/PhJ)F1"
# Keep outputs from the QC that are lists; if QC wasn't performed for a given background, the output was a character vector warning
founder_background_QC_tr <- bg_QC %>%
  purrr::keep(., is.list) %>%
  # Instead of having 56 elements of lists of two, have two lists of 56:
  # 1) All good genotypes from each cross with concordance values
  # 2) All concordance summaries for each cross
  purrr::transpose(.)

# Bind together all concordance summaries
founder_concordance_df <- Reduce(rbind, founder_background_QC_tr[[2]])
# If all markers for a given chromosome type were either concordant or discordant, NAs are returned
# This step assigns those NA values a 0
founder_concordance_df[is.na(founder_concordance_df)] <- 0
# Form concordance as a percentage
founder_concordance_df_2 <- founder_concordance_df %>%
  dplyr::mutate(concordance = MATCH/(MATCH + `NO MATCH`))
founder_concordance_df_2$alt_chr <- factor(founder_concordance_df_2$alt_chr,
                                           levels = c("Autosome","X","Y","M","Other"))

# Interactive plot of the genotype concordance for each sample across chromosome types
founder_palette <- qtl2::CCcolors
names(founder_palette) <- founders
founderQCplot <- ggplot(data = founder_concordance_df_2, mapping = aes(x = sample, 
                                                                       y = concordance,
                                                                       colour = sire,
                                                                       fill = dam)) + 
  geom_point(shape = 21, size = 4) + 
  scale_colour_manual(values = founder_palette) + 
  scale_fill_manual(values = founder_palette) +
  facet_grid(alt_chr~inferred.sex) +  
  QCtheme + 
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

ggplotly(founderQCplot)